Adjectives in SLE

Author

Nina Funke

Published

16 Sep 2025

1 Data Preparation

Load packages

# Clear the workspace before the analysis starts.
# NOTE(review): rm(list=ls()) does not detach packages or reset options();
# restarting R gives a cleaner slate — kept here to preserve the original flow.
rm(list=ls(all=TRUE))
# Helper functions explorer.cat()/explorer.num() (Stefan Th. Gries).
# (The stray leading "1" markers in the rendered source were Quarto code
# annotations fused into the code; removed so source() actually runs.)
source("_helpers/explorer.cat.r")
source("_helpers/explorer.num.r")
library(Boruta); library(caret); library(dplyr); library(MASS); library(partykit); library(pdp); library(randomForest); library(tree)
Both functions are created by Stefan Th. Gries.

Load the data

# Read the tab-separated adjective data set; decimals are comma-marked
# and character columns are turned into factors on import.
x <- read.delim(file="01a_adj.txt", dec=',', stringsAsFactors=TRUE)

Add persistence measure: positive values show that the previous comparison was analytic and negative values show that it was synthetic

# Build the persistence score in [-1, 1]:
#   |PERSISTENCE| = 1 - PERSIST_DIST/1000, so closer previous comparisons
#   weigh more; the sign encodes the previous form (analytic > 0, synthetic < 0).
# PERSIST_DIST was read as a factor; "none" (no previous comparison) becomes NA
x$PERSIST_DIST[x$PERSIST_DIST=="none"] <- NA
x$PERSIST_DIST <- as.numeric(as.character(x$PERSIST_DIST))
# Start with the synthetic (negative) orientation for every case
x$PERSISTENCE <- -(1-(x$PERSIST_DIST/1000))
# No previous comparison -> neutral score of 0 (must precede the next line)
x$PERSISTENCE[is.na(x$PERSISTENCE)] <- 0
# Previous comparison more than 1000 characters away -> also neutral;
# NAs in PERSIST_DIST act as FALSE here because the RHS has length 1
x$PERSISTENCE[x$PERSIST_DIST>1000] <- 0
# Flip the sign where the previous comparison was analytic -> positive values
x$PERSISTENCE[x$PERSIST_COMP=='analytic'] <- -x$PERSISTENCE[x$PERSIST_COMP=='analytic']

PERSISTENCE now only includes those cases within 1000 characters before the actual adjective, so adjust the PERSIST_FORM variable accordingly to only count those cases:

# Restrict PERSIST_FORM to previous comparisons within 1000 characters:
# anything farther away is recoded as 'none', mirroring PERSISTENCE.
# which() drops the NA distances, matching the original NA-as-FALSE behavior.
x$PERSIST_FORM.1000 <- x$PERSIST_FORM
x$PERSIST_FORM.1000[which(x$PERSIST_DIST > 1000)] <- 'none'

Add the Rhythm differences: if positive, the analytic pattern was better, if negative, the synthetic pattern was better

# Re-express the rhythm scores per pattern instead of per chosen/alternative:
# RHY_A = score of the analytic pattern, RHY_S = score of the synthetic one.
# COMPARISON has exactly the two levels analytic/synthetic (no NAs), so the
# non-analytic branch is the synthetic one.
x$RHY_A <- ifelse(x$COMPARISON=='analytic', x$RHY_SCORE_POS, x$RHY_SCORE_ALT_POS)
x$RHY_S <- ifelse(x$COMPARISON=='analytic', x$RHY_SCORE_ALT_POS, x$RHY_SCORE_POS)

# Difference score: which pattern fares better rhythmically
x$RHY_DIFF <- x$RHY_S - x$RHY_A

Add the Segment differences: if positive, the analytic patterns was better, if negative, the synthetic pattern was better

# Same re-expression for the segment scores: SEG_A = analytic pattern,
# SEG_S = synthetic pattern (two-level COMPARISON, no NAs).
x$SEG_A <- ifelse(x$COMPARISON=='analytic', x$SEG_SCORE, x$SEG_SCORE_ALT)
x$SEG_S <- ifelse(x$COMPARISON=='analytic', x$SEG_SCORE_ALT, x$SEG_SCORE)

# Difference score for the segmental measure
x$SEG_DIFF <- x$SEG_S - x$SEG_A
summary(x)
       NO              FILE              COMPARISON    ADJ_LEMMA  
 Min.   :150557   Min.   : 4703076   analytic :117   good   : 80  
 1st Qu.:156950   1st Qu.: 4705601   synthetic:329   high   : 36  
 Median :171688   Median : 4727066                   bad    : 21  
 Mean   :189707   Mean   :22743140                   large  : 19  
 3rd Qu.:219231   3rd Qu.:41786608                   low    : 17  
 Max.   :234528   Max.   :41855485                   great  : 16  
                                                     (Other):257  
      CORPUS         YEAR      VARIETY          NEWSPAPER     WORD_COUNT  
 NOW2020 :217   Min.   :2020   BrE:217   Daily Mail  :108   Min.   : 140  
 SAVE2020:229   1st Qu.:2020   LK :229   Daily Mirror:124   1st Qu.: 786  
                Median :2020             Daily News  :105   Median :1194  
                Mean   :2020             Independent :109   Mean   :1561  
                3rd Qu.:2020                                3rd Qu.:1803  
                Max.   :2020                                Max.   :5629  
                                                                          
          FORM        ADJ_LEN        READABILITY          LEXDIV        
 comparative:219   Min.   : 3.000   Min.   :-492.56   Min.   :-203.241  
 superlative:227   1st Qu.: 4.000   1st Qu.:-210.52   1st Qu.:   2.881  
                   Median : 4.000   Median : -19.68   Median :  17.506  
                   Mean   : 5.193   Mean   : 256.85   Mean   :  15.408  
                   3rd Qu.: 6.000   3rd Qu.: 351.55   3rd Qu.:  34.824  
                   Max.   :13.000   Max.   :4743.51   Max.   :  61.193  
                                                                        
 STRESS_LAST_SYLL    PERSIST_COMP      PERSIST_FORM  PERSIST_DIST    SYNT_FUN
 n:124            analytic : 84   comparative:160   Min.   :   0.0   a :348  
 y:322            none     :119   none       :119   1st Qu.: 164.5   n : 26  
                  synthetic:243   superlative:167   Median : 479.0   p : 70  
                                                    Mean   : 854.6   pn:  2  
                                                    3rd Qu.:1160.0           
                                                    Max.   :8062.0           
                                                    NA's   :119              
 ADVMOD  COMPL     ZIPF_FREQ        DPNOFREQ         RHY_SCORE_POS    
 n:411   i: 11   Min.   :2.438   Min.   :0.0000938   Min.   :0.00000  
 y: 35   n:398   1st Qu.:5.011   1st Qu.:0.0212153   1st Qu.:0.00000  
         p: 23   Median :5.524   Median :0.0323701   Median :0.00000  
         t: 14   Mean   :5.380   Mean   :0.0787586   Mean   :0.07869  
                 3rd Qu.:5.948   3rd Qu.:0.1563902   3rd Qu.:0.00000  
                 Max.   :6.377   Max.   :0.2237505   Max.   :1.00000  
                                                                      
 RHY_SCORE_ALT_POS   SEG_SCORE      SEG_SCORE_ALT    FINAL_SEGMENT   
 Min.   :0.0000    Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.0000    1st Qu.:0.1769   1st Qu.:0.1824   1st Qu.:0.2500  
 Median :0.1667    Median :0.2378   Median :0.2500   Median :0.5000  
 Mean   :0.1905    Mean   :0.2407   Mean   :0.2468   Mean   :0.4105  
 3rd Qu.:0.3333    3rd Qu.:0.3122   3rd Qu.:0.3152   3rd Qu.:0.6250  
 Max.   :1.0000    Max.   :0.6250   Max.   :0.5000   Max.   :1.0000  
                                                                     
  PERSISTENCE        PERSIST_FORM.1000     RHY_A            RHY_S       
 Min.   :-1.0000   comparative:114     Min.   :0.0000   Min.   :0.0000  
 1st Qu.:-0.6020   none       :221     1st Qu.:0.0000   1st Qu.:0.0000  
 Median : 0.0000   superlative:111     Median :0.1250   Median :0.0000  
 Mean   :-0.1638                       Mean   :0.1859   Mean   :0.0833  
 3rd Qu.: 0.0000                       3rd Qu.:0.3333   3rd Qu.:0.0000  
 Max.   : 1.0000                       Max.   :1.0000   Max.   :1.0000  
                                                                        
    RHY_DIFF           SEG_A            SEG_S           SEG_DIFF       
 Min.   :-1.0000   Min.   :0.0000   Min.   :0.0000   Min.   :-0.17273  
 1st Qu.:-0.3333   1st Qu.:0.2000   1st Qu.:0.1667   1st Qu.:-0.05706  
 Median : 0.0000   Median :0.2547   Median :0.2301   Median :-0.03507  
 Mean   :-0.1026   Mean   :0.2520   Mean   :0.2355   Mean   :-0.01644  
 3rd Qu.: 0.0000   3rd Qu.:0.3206   3rd Qu.:0.3079   3rd Qu.: 0.01389  
 Max.   : 0.6667   Max.   :0.5000   Max.   :0.6250   Max.   : 0.32143  
                                                                       

2 Data Exploration

The variables initially included are: ADJ_LEN, ADVMOD, COMPARISON, COMPL, DPNOFREQ, FINAL_SEGMENT, FORM, LEXDIV, NEWSPAPER, PERSIST_FORM.1000, PERSISTENCE, READABILITY, RHY_DIFF, RHY_A, RHY_S, SEG_DIFF, SEG_A, SEG_S, STRESS_LAST_SYLL, SYNT_FUN, VARIETY, WORD_COUNT, ZIPF_FREQ

2.1 Dependent Variable

Look at COMPARISON: 26.2% analytic, 73.8% synthetic

explorer.cat(x$COMPARISON)
$`Missing data`
[1] FALSE

$Frequencies
 analytic synthetic 
      117       329 

$Percentages
 analytic synthetic 
    0.262     0.738 

$`Freqs of freqs`
117 329 
  1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              2             446 

2.2 Every Other Variable

Look at ADJ_LEN: 1st Qu = Median, but that doesn’t change in any of the suggested transformations, so leave it as is

explorer.num(x$ADJ_LEN)

$`Special data points`
Missing data        Zeros    Negatives 
       FALSE        FALSE        FALSE 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  3.000   4.000   4.000   5.193   6.000  13.000 

$`Types and tokens`
 Types Tokens 
    11    446 

$`Freqs of freqs`

  0   1   9  14  24  28  39  70  82 164 
  1   2   1   2   1   1   1   1   1   1 

$`Smallest 'meaningful' difference`
[1] 1

Look at ADVMOD: 92.2% no vs. 7.8% yes, very few adjectives are adverbially modified

explorer.cat(x$ADVMOD)
$`Missing data`
[1] FALSE

$Frequencies
  n   y 
411  35 

$Percentages
    n     y 
0.922 0.078 

$`Freqs of freqs`
 35 411 
  1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              2             446 

Look at COMPL: 2.5% to-infinitive, 89.2% no complement, 5.2% prepositional phrase, 3.1% than-phrase; few complements overall; conflate to to-infinitive and prepositional phrase vs. no complement and than-phrase (see Mondorf (2014) for why than-phrases are not complements).

explorer.cat(x$COMPL)
$`Missing data`
[1] FALSE

$Frequencies
  i   n   p   t 
 11 398  23  14 

$Percentages
    i     n     p     t 
0.025 0.892 0.052 0.031 

$`Freqs of freqs`
 11  14  23 398 
  1   1   1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              4             446 
# Exploratory classification tree: how do the COMPL levels group with
# respect to COMPARISON? (informs the conflation below)
tree(x$COMPARISON ~ x$COMPL)
node), split, n, deviance, yval, (yprob)
      * denotes terminal node

1) root 446 513.30 synthetic ( 0.2623 0.7377 )  
  2) x$COMPL: i,p 34  46.66 synthetic ( 0.4412 0.5588 ) *
  3) x$COMPL: n,t 412 461.20 synthetic ( 0.2476 0.7524 ) *
# Conflate COMPL: to-infinitives (i) and PPs (p) count as complements ('y'),
# no complement (n) and than-phrases (t) as none ('n'); cf. Mondorf (2014).
x$COMPL.confl <- x$COMPL
levels(x$COMPL.confl) <- list(y=c('i', 'p'), n=c('n', 't'))

Look at COMPL.confl: 7.6% yes vs. 92.4% no

explorer.cat(x$COMPL.confl)
$`Missing data`
[1] FALSE

$Frequencies
  y   n 
 34 412 

$Percentages
    y     n 
0.076 0.924 

$`Freqs of freqs`
 34 412 
  1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              2             446 

Look at DPNOFREQ: looks okay

explorer.num(x$DPNOFREQ)

$`Special data points`
Missing data        Zeros    Negatives 
       FALSE        FALSE        FALSE 

$Summary
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0000938 0.0212153 0.0323701 0.0787586 0.1563902 0.2237505 

$`Types and tokens`
 Types Tokens 
   149    446 

$`Freqs of freqs`

 0  1  2  3  4  5  7  9 10 11 12 18 26 38 41 
 1 84 32  7  4  6  4  3  3  1  1  1  1  1  1 

$`Smallest 'meaningful' difference`
[1] 5.49e-06

Look at FINAL_SEGMENT: fine

explorer.num(x$FINAL_SEGMENT)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"   "TRUE:100"      "FALSE" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.2500  0.5000  0.4105  0.6250  1.0000 

$`Types and tokens`
 Types Tokens 
    14    446 

$`Freqs of freqs`

  0   2  11  12  13  17  19  20  43  80 100 102 
  1   2   1   2   2   1   1   1   1   1   1   1 

$`Smallest 'meaningful' difference`
[1] 0.04166667

Look at FORM: 49.1% comparative vs. 50.9% superlative

explorer.cat(x$FORM)
$`Missing data`
[1] FALSE

$Frequencies
comparative superlative 
        219         227 

$Percentages
comparative superlative 
      0.491       0.509 

$`Freqs of freqs`
219 227 
  1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              2             446 

Look at LEXDIV: one pretty low value, winsorize to lowest value in boxplot that is not an outlier (-43.82754)

explorer.num(x$LEXDIV)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"      "FALSE"    "TRUE:86" 

$Summary
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-203.241    2.881   17.506   15.408   34.824   61.193 

$`Types and tokens`
 Types Tokens 
   120    446 

$`Freqs of freqs`

 0  1  2  3  4  5  6  7  8  9 10 11 12 14 16 25 
 1 43 19 12 12  9  5  7  2  3  1  1  2  2  1  1 

$`Smallest 'meaningful' difference`
[1] 0.0196487
boxplot(x$LEXDIV)$stats[1,1] # -43.82754

[1] -43.82754
# Winsorize LEXDIV: clamp values below the boxplot's lower whisker to the
# whisker value (-43.82754, as computed above). Computing the threshold here
# instead of hard-coding it keeps the step correct if the data change.
x$LEXDIV.win <- x$LEXDIV
lexdiv.floor <- boxplot(x$LEXDIV, plot=FALSE)$stats[1,1] # -43.82754
x$LEXDIV.win[x$LEXDIV.win < lexdiv.floor] <- lexdiv.floor

Look at LEXDIV.win: better

explorer.num(x$LEXDIV.win)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"      "FALSE"    "TRUE:86" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-43.828   2.881  17.506  16.629  34.824  61.193 

$`Types and tokens`
 Types Tokens 
   113    446 

$`Freqs of freqs`

 0  1  2  3  4  5  6  7  8  9 10 11 12 14 16 22 25 
 1 37 19 12 11  9  5  7  2  3  1  1  1  2  1  1  1 

$`Smallest 'meaningful' difference`
[1] 0.0196487

Look at NEWSPAPER: 24.2% Daily Mail vs. 27.8% Daily Mirror vs. 23.5% Daily News vs. 24.4% Independent

explorer.cat(x$NEWSPAPER)
$`Missing data`
[1] FALSE

$Frequencies
  Daily Mail Daily Mirror   Daily News  Independent 
         108          124          105          109 

$Percentages
  Daily Mail Daily Mirror   Daily News  Independent 
       0.242        0.278        0.235        0.244 

$`Freqs of freqs`
105 108 109 124 
  1   1   1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              4             446 

Look at PERSIST_FORM.1000: 25.6% comparative vs. 49.6% none vs. 24.9% superlative

explorer.cat(x$PERSIST_FORM.1000)
$`Missing data`
[1] FALSE

$Frequencies
comparative        none superlative 
        114         221         111 

$Percentages
comparative        none superlative 
      0.256       0.496       0.249 

$`Freqs of freqs`
111 114 221 
  1   1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              3             446 

Look at PERSISTENCE: about half of the values are 0, I could change that by raising the max. limit of PERSIST_DIST (right now it’s 1000 characters), but there are still 26.8% of data points without Persistence so this wouldn’t be fixed entirely; more negative than positive values because there are more synthetic comparisons

explorer.num(x$PERSISTENCE)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"   "TRUE:221"   "TRUE:163" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-1.0000 -0.6020  0.0000 -0.1638  0.0000  1.0000 

$`Types and tokens`
 Types Tokens 
   189    446 

$`Freqs of freqs`

  0   1   2   3   8 221 
  1 161  22   4   1   1 

$`Smallest 'meaningful' difference`
[1] 0.001

Look at READABILITY: log it

explorer.num(x$READABILITY)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"      "FALSE"   "TRUE:236" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-492.56 -210.52  -19.68  256.85  351.55 4743.51 

$`Types and tokens`
 Types Tokens 
   120    446 

$`Freqs of freqs`

 0  1  2  3  4  5  6  7  8  9 10 11 12 14 16 25 
 1 43 19 12 12  9  5  7  2  3  1  1  2  2  1  1 

$`Smallest 'meaningful' difference`
[1] 0.2674029
# Signed log transform: log2 of the magnitude, then reattach the sign,
# so negative readability scores stay negative (safe here: no zeros in
# READABILITY per the explorer output above; a zero would yield NaN)
x$READABILITY.log <- log2(abs(x$READABILITY)) * sign(x$READABILITY)

Look at READABILITY.log: better

explorer.num(x$READABILITY.log)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"      "FALSE"   "TRUE:229" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-8.9442 -7.7178 -4.2990 -0.1076  8.4576 12.2117 

$`Types and tokens`
 Types Tokens 
   120    446 

$`Freqs of freqs`

 0  1  2  3  4  5  6  7  8  9 10 11 12 14 16 25 
 1 43 19 12 12  9  5  7  2  3  1  1  2  2  1  1 

$`Smallest 'meaningful' difference`
[1] 0.001060231

Look at RHY_DIFF: a lot of zeros but none of the suggested transformations would fix that

explorer.num(x$RHY_DIFF)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"   "TRUE:222"   "TRUE:182" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-1.0000 -0.3333  0.0000 -0.1026  0.0000  0.6667 

$`Types and tokens`
 Types Tokens 
    51    446 

$`Freqs of freqs`

  0   1   2   3   4   5   6   7  11  13  17  37  66 222 
  1  23  13   3   1   1   1   1   1   1   1   1   1   1 

$`Smallest 'meaningful' difference`
[1] 0.00297619

Look at RHY_A: looks fine

explorer.num(x$RHY_A)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"   "TRUE:207"      "FALSE" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.1250  0.1859  0.3333  1.0000 

$`Types and tokens`
 Types Tokens 
    26    446 

$`Freqs of freqs`

  0   1   2   3   4   5   6  20  24  53  90 207 
  1   7   7   2   2   1   2   1   1   1   1   1 

$`Smallest 'meaningful' difference`
[1] 0.00297619

Look at RHY_S: again a lot of zeros

explorer.num(x$RHY_S)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"   "TRUE:336"      "FALSE" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.0833  0.0000  1.0000 

$`Types and tokens`
 Types Tokens 
    24    446 

$`Freqs of freqs`

  0   1   2   3   5   6  14  25  33 336 
  1  10   7   1   1   1   1   1   1   1 

$`Smallest 'meaningful' difference`
[1] 0.005952381

Look at SEG_DIFF: fine

explorer.num(x$SEG_DIFF)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"    "TRUE:37"   "TRUE:277" 

$Summary
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.17273 -0.05706 -0.03507 -0.01644  0.01389  0.32143 

$`Types and tokens`
 Types Tokens 
   264    446 

$`Freqs of freqs`

  0   1   2   3   4   5   6   7  37 
  1 175  41  22   7   4   4   2   1 

$`Smallest 'meaningful' difference`
[1] 4.768e-06

Look at SEG_A: fine

explorer.num(x$SEG_A)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"    "TRUE:10"      "FALSE" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.2000  0.2547  0.2520  0.3206  0.5000 

$`Types and tokens`
 Types Tokens 
   172    446 

$`Freqs of freqs`

 0  1  2  3  4  5  6  7  8  9 10 11 13 16 24 
 1 95 27 17 10  5  1  4  4  2  2  1  2  1  1 

$`Smallest 'meaningful' difference`
[1] 2.3527e-05

Look at SEG_S: fine

explorer.num(x$SEG_S)

$`Special data points`
Missing data        Zeros    Negatives 
     "FALSE"    "TRUE:19"      "FALSE" 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.1667  0.2301  0.2355  0.3079  0.6250 

$`Types and tokens`
 Types Tokens 
   175    446 

$`Freqs of freqs`

  0   1   2   3   4   5   6   7   8  10  11  12  16  19 
  1 103  30  12   6   6   3   3   1   2   4   1   2   2 

$`Smallest 'meaningful' difference`
[1] 1.8038e-05

Look at STRESS_LAST_SYLL: 27.8% no vs. 72.2% yes

explorer.cat(x$STRESS_LAST_SYLL)
$`Missing data`
[1] FALSE

$Frequencies
  n   y 
124 322 

$Percentages
    n     y 
0.278 0.722 

$`Freqs of freqs`
124 322 
  1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              2             446 

Look at SYNT_FUN: 78% attributive vs. 5.8% nominal vs. 15.7% predicative vs. 0.4% post-nominal; conflate to predicative and post-nominal vs. attributive and nominal

explorer.cat(x$SYNT_FUN)
$`Missing data`
[1] FALSE

$Frequencies
  a   n   p  pn 
348  26  70   2 

$Percentages
    a     n     p    pn 
0.780 0.058 0.157 0.004 

$`Freqs of freqs`
  2  26  70 348 
  1   1   1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              4             446 
# Exploratory classification tree: how do the SYNT_FUN levels group with
# respect to COMPARISON? (informs the conflation below)
tree(x$COMPARISON ~ x$SYNT_FUN)
node), split, n, deviance, yval, (yprob)
      * denotes terminal node

1) root 446 513.30 synthetic ( 0.2623 0.7377 )  
  2) x$SYNT_FUN: p,pn 72  99.59 synthetic ( 0.4722 0.5278 ) *
  3) x$SYNT_FUN: a,n 374 395.90 synthetic ( 0.2219 0.7781 ) *
# Conflate SYNT_FUN following the tree split: attributive (a) and nominal (n)
# become 'a'; predicative (p) and post-nominal (pn) become 'p'.
x$SYNT_FUN.confl <- x$SYNT_FUN
levels(x$SYNT_FUN.confl) <- list(a=c('a', 'n'), p=c('p', 'pn'))

Look at SYNT_FUN.confl: 83.9% attributive vs. 16.1% predicative

explorer.cat(x$SYNT_FUN.confl)
$`Missing data`
[1] FALSE

$Frequencies
  a   p 
374  72 

$Percentages
    a     p 
0.839 0.161 

$`Freqs of freqs`
 72 374 
  1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              2             446 

Look at VARIETY: 48.7% BrE vs. 51.3% LK

explorer.cat(x$VARIETY)
$`Missing data`
[1] FALSE

$Frequencies
BrE  LK 
217 229 

$Percentages
  BrE    LK 
0.487 0.513 

$`Freqs of freqs`
217 229 
  1   1 

$`Types and tokens`
 types (no NAs) Tokens (no NAs) 
              2             446 

Look at WORD_COUNT: log

explorer.num(x$WORD_COUNT)

$`Special data points`
Missing data        Zeros    Negatives 
       FALSE        FALSE        FALSE 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    140     786    1194    1561    1803    5629 

$`Types and tokens`
 Types Tokens 
   115    446 

$`Freqs of freqs`

 0  1  2  3  4  5  6  7  8  9 10 11 12 14 16 25 
 1 40 19 11 13  7  4  6  2  3  1  2  2  3  1  1 

$`Smallest 'meaningful' difference`
[1] 1
# Log-transform article length to reduce its right skew (all counts > 0)
x$WORD_COUNT.log <- log2(x$WORD_COUNT)

Look at WORD_COUNT.log: better

explorer.num(x$WORD_COUNT.log)

$`Special data points`
Missing data        Zeros    Negatives 
       FALSE        FALSE        FALSE 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  7.129   9.618  10.222  10.213  10.816  12.459 

$`Types and tokens`
 Types Tokens 
   115    446 

$`Freqs of freqs`

 0  1  2  3  4  5  6  7  8  9 10 11 12 14 16 25 
 1 40 19 11 13  7  4  6  2  3  1  2  2  3  1  1 

$`Smallest 'meaningful' difference`
[1] 0.00135528

Look at ZIPF_FREQ: skewed to the left… boxcox

explorer.num(x$ZIPF_FREQ)

$`Special data points`
Missing data        Zeros    Negatives 
       FALSE        FALSE        FALSE 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.438   5.011   5.524   5.380   5.948   6.377 

$`Types and tokens`
 Types Tokens 
   146    446 

$`Freqs of freqs`

 0  1  2  3  4  5  7  9 10 11 12 18 26 38 41 
 1 81 30  8  5  6  4  3  3  1  1  1  1  1  1 

$`Smallest 'meaningful' difference`
[1] 5.1168e-05
# Box-Cox profile likelihood for ZIPF_FREQ (intercept-only model)
b <- boxcox(lm(x$ZIPF_FREQ ~ 1))

# Pick the lambda with the highest profile likelihood (here: 2)
(lambda <- b$x[which.max(b$y)])
[1] 2
# Apply the Box-Cox power transform with the estimated lambda
# (this formula requires lambda != 0; lambda == 0 would need log() instead)
x$ZIPF_FREQ.trans <- (x$ZIPF_FREQ^lambda - 1)/lambda

Look at ZIPF_FREQ.trans:

explorer.num(x$ZIPF_FREQ.trans)

$`Special data points`
Missing data        Zeros    Negatives 
       FALSE        FALSE        FALSE 

$Summary
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.473  12.056  14.756  14.282  17.192  19.836 

$`Types and tokens`
 Types Tokens 
   146    446 

$`Freqs of freqs`

 0  1  2  3  4  5  7  9 10 11 12 18 26 38 41 
 1 81 30  8  5  6  4  3  3  1  1  1  1  1  1 

$`Smallest 'meaningful' difference`
[1] 0.000319287

3 Manually add the interaction terms

The categorical variables

# VARIETY-by-predictor interaction terms for the categorical variables:
# each new column VARIETYx<VAR> crosses VARIETY with the predictor's levels.
for (v in c("ADVMOD", "COMPL.confl", "FORM", "NEWSPAPER",
            "PERSIST_FORM.1000", "STRESS_LAST_SYLL", "SYNT_FUN.confl")) {
  x[[paste0("VARIETYx", v)]] <- x$VARIETY:x[[v]]
}

The numeric variables - I went with ctree() because for RHY_DIFF the tree() function made too many splits to compute them & e.g. the 15 splits for ZIPF_FREQ are just not interpretable

  1. ADJ_LEN:
#(t1 <- tree(COMPARISON ~ ADJ_LEN, data=x)) # 4 levels
plot(t1 <- ctree(COMPARISON ~ ADJ_LEN, data=x)) # 4 levels

Split ADJ_LEN into 4 levels:

x$ADJ_LEN.cat <- cut(x$ADJ_LEN, c(-Inf, 4, 5, 7, Inf), labels=c('(0,4]', '(4,5]', '(5,7]', '(7,13]'))
  1. DPNOFREQ
#(t2 <- tree(COMPARISON ~ DPNOFREQ, data=x)) # 15 levels
plot(t2 <- ctree(COMPARISON ~ DPNOFREQ, data=x)) # 3 levels

Split DPNOFREQ into 3 levels:

x$DPNOFREQ.cat <- cut(x$DPNOFREQ, c(-Inf, 0.017, 0.162, Inf), labels=c('[0,0.017]', '(0.017,0.162]', '(0.162,1]'))
  1. FINAL_SEGMENT:
#(t3 <- tree(COMPARISON ~ FINAL_SEGMENT, data=x)) # 5 levels
plot(t3 <- ctree(COMPARISON ~ FINAL_SEGMENT, data=x))

Split FINAL_SEGMENT into 5 levels:

x$FINAL_SEGMENT.cat <- cut(x$FINAL_SEGMENT, c(-Inf, 0, 0.333, 0.625, 0.667, Inf), labels=c('0', '(0,0.333]', '(0.333,0.625]', '(0.625,0.667]', '(0.667,1]'))
  1. LEXDIV.win:
# (t3 <- tree(COMPARISON ~ LEXDIV.win, data=x)) # 7 levels
plot(t3 <- ctree(COMPARISON ~ LEXDIV.win, data=x))

No splits for LEXDIV.win

  1. PERSISTENCE:
#(t4 <- tree(COMPARISON ~ PERSISTENCE, data=x)) # 2 levels
plot(t4 <- ctree(COMPARISON ~ PERSISTENCE, data=x)) # 2 levels

Split PERSISTENCE into 2 levels:

x$PERSISTENCE.cat <- cut(x$PERSISTENCE, c(-Inf, 0.587, Inf), labels=c('[-1,0.587]', '(0.587,1]'))
  1. READABILITY.log:
#(t5 <- tree(COMPARISON ~ READABILITY, data=x)) # 3 levels
plot(t5 <- ctree(COMPARISON ~ READABILITY.log, data=x))

No split for READABILITY.log

  1. RHY_DIFF:
#(t6 <- tree(COMPARISON ~ RHY_DIFF, data=x))
plot(t6 <- ctree(COMPARISON ~ RHY_DIFF, data=x)) # 4 levels

Using ctree(), split RHY_DIFF into 4 levels:

# Bin RHY_DIFF at the ctree() split points. Fix: the last break is -0.054,
# so the top bin's label must read '(-0.054,1]' (original said '(0.054,1]').
x$RHY_DIFF.cat <- cut(x$RHY_DIFF, c(-Inf, -0.33, -0.25, -0.054, Inf), labels=c('[-1,-0.33]', '(-0.33,-0.25]', '(-0.25,-0.054]', '(-0.054,1]'))
  1. RHY_A:
#(t7 <- tree(COMPARISON ~ RHY_A, data=x)) # 7 levels
plot(t7 <- ctree(COMPARISON ~ RHY_A, data=x)) # 4 levels

Split RHY_A into 4 levels:

x$RHY_A.cat <- cut(x$RHY_A, c(-Inf, 0.179, 0.225, 0.33, Inf), labels=c('[0,0.179]', '(0.179,0.225]', '(0.225,0.33]', '(0.33,1]'))
  1. RHY_S:
#(t8 <- tree(COMPARISON ~ RHY_S, data=x)) # 6 levels
plot(t8 <- ctree(COMPARISON ~ RHY_S, data=x))

Split RHY_S into 2 levels:

x$RHY_S.cat <- cut(x$RHY_S, c(-Inf, 0.217, Inf), labels=c('[0,0.217]', '(0.217,1]'))
  1. SEG_DIFF
(t9 <- tree(COMPARISON ~ SEG_DIFF, data=x)) # 5 levels
node), split, n, deviance, yval, (yprob)
      * denotes terminal node

 1) root 446 513.30 synthetic ( 0.2623 0.7377 )  
   2) SEG_DIFF < 0.154006 433 505.30 synthetic ( 0.2702 0.7298 )  
     4) SEG_DIFF < 0.0586081 398 448.70 synthetic ( 0.2513 0.7487 )  
       8) SEG_DIFF < 0.0414377 385 441.00 synthetic ( 0.2597 0.7403 ) *
       9) SEG_DIFF > 0.0414377 13   0.00 synthetic ( 0.0000 1.0000 ) *
     5) SEG_DIFF > 0.0586081 35  48.49 synthetic ( 0.4857 0.5143 )  
      10) SEG_DIFF < 0.0663055 8   0.00 analytic ( 1.0000 0.0000 ) *
      11) SEG_DIFF > 0.0663055 27  34.37 synthetic ( 0.3333 0.6667 ) *
   3) SEG_DIFF > 0.154006 13   0.00 synthetic ( 0.0000 1.0000 ) *
plot(t9 <- ctree(COMPARISON ~ SEG_DIFF, data=x))

No split for SEG_DIFF.

  1. SEG_A
(t10 <- tree(COMPARISON ~ SEG_A, data=x)) # 2 levels
node), split, n, deviance, yval, (yprob)
      * denotes terminal node

1) root 446 513.300 synthetic ( 0.26233 0.73767 )  
  2) SEG_A < 0.364358 399 481.000 synthetic ( 0.29073 0.70927 ) *
  3) SEG_A > 0.364358 47   9.679 synthetic ( 0.02128 0.97872 ) *
plot(t10 <- ctree(COMPARISON ~ SEG_A, data=x))

No split for SEG_A.

  1. SEG_S
#(t11 <- tree(COMPARISON ~ SEG_S, data=x)) # 9 levels
plot(t11 <- ctree(COMPARISON ~ SEG_S, data=x))

Split SEG_S into 2 levels

x$SEG_S.cat <- cut(x$SEG_S, c(-Inf, 0.327,  Inf), labels=c('[0,0.327]', '(0.327,1]'))
  1. WORD_COUNT.log:
#(t12 <- tree(COMPARISON ~ WORD_COUNT, data=x)) # 8 levels
plot(t12 <- ctree(COMPARISON ~ WORD_COUNT.log, data=x))

No splits for WORD_COUNT.log

  1. ZIPF_FREQ.trans:
#(t13 <- tree(COMPARISON ~ ZIPF_FREQ, data=x)) # 15 levels
plot(t13 <- ctree(COMPARISON ~ ZIPF_FREQ.trans, data=x))

Split ZIPF_FREQ into 4 levels:

# Bin the transformed Zipf frequency at the ctree() split points
x$ZIPF_FREQ.cat <- cut(x$ZIPF_FREQ.trans, c(-Inf, 13.293, 14.756, 14.942,  Inf), labels=c('[-0.5,13.293]', '(13.293,14.756]', '(14.756,14.942]', '(14.942,24]'))
# VARIETY-by-predictor interaction terms for the binned numeric variables
for (v in c("ADJ_LEN.cat", "DPNOFREQ.cat", "FINAL_SEGMENT.cat",
            "PERSISTENCE.cat", "RHY_DIFF.cat", "RHY_A.cat",
            "RHY_S.cat", "SEG_S.cat", "ZIPF_FREQ.cat")) {
  x[[paste0("VARIETYx", v)]] <- x$VARIETY:x[[v]]
}

4 Model Preparation

Variables to include in the model: ADJ_LEN, DPNOFREQ, FINAL_SEGMENT, LEXDIV.win, PERSISTENCE, READABILITY.log, RHY_DIFF, RHY_A, RHY_S, SEG_DIFF, SEG_A, SEG_S, WORD_COUNT.log, ZIPF_FREQ.trans, ADVMOD, COMPL.confl, FORM, NEWSPAPER, PERSIST_FORM.1000, STRESS_LAST_SYLL, SYNT_FUN.confl, VARIETY, VARIETYxADJ_LEN.cat, VARIETYxDPNOFREQ.cat , VARIETYxFINAL_SEGMENT.cat, VARIETYxPERSISTENCE.cat, VARIETYxRHY_DIFF.cat, VARIETYxRHY_A.cat, VARIETYxRHY_S.cat, VARIETYxSEG_S.cat, VARIETYxZIPF_FREQ.cat, VARIETYxADVMOD,VARIETYxCOMPL.confl, VARIETYxFORM,VARIETYxNEWSPAPER, VARIETYxPERSIST_FORM.1000, VARIETYxSTRESS_LAST_SYLL, VARIETYxSYNT_FUN.confl

Compute the two baselines: 73.8% is the one to beat

# Baseline 1: accuracy of always predicting the most frequent level;
# baseline 2: expected accuracy of random assignment proportional to the
# observed frequencies (sum of squared proportions)
c("baseline 1"=baseline.1 <- max(prop.table(table(x$COMPARISON))),
  "baseline 2"=baseline.2 <- sum(prop.table(table(x$COMPARISON))^2))
baseline 1 baseline 2 
 0.7376682  0.6129723 

Variable Selection Boruta suggests:
ADJ_LEN, DPNOFREQ, FINAL_SEGMENT, LEXDIV.win, PERSISTENCE, READABILITY.log, RHY_DIFF, RHY_A, SEG_DIFF, SEG_A, SEG_S, WORD_COUNT.log, ZIPF_FREQ.trans, STRESS_LAST_SYLL, SYNT_FUN.confl, VARIETYxADJ_LEN.cat, VARIETYxDPNOFREQ.cat , VARIETYxFINAL_SEGMENT.cat, VARIETYxRHY_DIFF.cat, VARIETYxRHY_A.cat, VARIETYxRHY_S.cat, VARIETYxSEG_S.cat, VARIETYxZIPF_FREQ.cat, VARIETYxADVMOD,VARIETYxCOMPL.confl, VARIETYxFORM, VARIETYxPERSIST_FORM.1000, VARIETYxSTRESS_LAST_SYLL, VARIETYxSYNT_FUN.confl

Exclude:
ADVMOD, COMPL.confl, FORM, NEWSPAPER, PERSIST_FORM.1000, RHY_S, VARIETY, VARIETYxPERSISTENCE.cat, VARIETYxNEWSPAPER, VARIETYxFORM

# Fixed RNG seed so the Boruta run (random forests inside) is reproducible
set.seed(sum(utf8ToInt("All the Young Dudes")))
# Boruta variable selection: compares each predictor's importance against
# shuffled "shadow" copies; maxRuns=200 gives Tentative attributes more
# iterations to resolve
predictors <- Boruta(COMPARISON ~ ADJ_LEN + ADVMOD + COMPL.confl + DPNOFREQ + FINAL_SEGMENT + FORM +  LEXDIV.win + NEWSPAPER + PERSIST_FORM.1000 + PERSISTENCE + READABILITY.log + RHY_DIFF + RHY_A + RHY_S + SEG_DIFF + SEG_A + SEG_S + STRESS_LAST_SYLL + SYNT_FUN.confl + VARIETY + WORD_COUNT.log + ZIPF_FREQ.trans + VARIETYxADJ_LEN.cat + VARIETYxDPNOFREQ.cat + VARIETYxFINAL_SEGMENT.cat +  VARIETYxPERSISTENCE.cat + VARIETYxRHY_DIFF.cat + VARIETYxRHY_A.cat + VARIETYxRHY_S.cat + VARIETYxSEG_S.cat + VARIETYxZIPF_FREQ.cat + VARIETYxADVMOD + VARIETYxCOMPL.confl + VARIETYxFORM + VARIETYxNEWSPAPER + VARIETYxPERSIST_FORM.1000 + VARIETYxSTRESS_LAST_SYLL + VARIETYxSYNT_FUN.confl, data=x, maxRuns=200)

attStats(predictors)
                              meanImp  medianImp      minImp     maxImp
VARIETYxSYNT_FUN.confl     3.54578446  3.4638071  1.64966908  5.6261129
VARIETYxSTRESS_LAST_SYLL  12.44683773 12.4118911 11.20183095 14.3581703
VARIETYxPERSIST_FORM.1000  2.53623705  2.5048933  1.16104112  4.5512077
VARIETYxNEWSPAPER          1.93250701  1.7795970  0.88231118  3.3473140
VARIETYxFORM               2.35650839  2.3187569  0.53085762  3.8383047
VARIETYxCOMPL.confl        2.40712788  2.4046307  1.11279383  3.9329763
VARIETYxADVMOD             2.31672896  2.3179899  0.49107400  4.0874927
VARIETYxZIPF_FREQ.cat      8.07115224  8.0270622  6.61081010  9.3860573
VARIETYxSEG_S.cat          5.26019305  5.2166157  3.13493937  7.3416357
VARIETYxRHY_S.cat          2.60748819  2.5915440  1.22571884  4.1086403
VARIETYxRHY_A.cat          6.15898981  6.2087533  4.51892337  7.6294173
VARIETYxRHY_DIFF.cat       4.92380164  4.9571398  3.39509718  6.4740550
VARIETYxPERSISTENCE.cat    2.22943727  2.2681966  0.43409144  3.3567433
VARIETYxFINAL_SEGMENT.cat  9.26702857  9.2726826  7.66759675 11.2296263
VARIETYxDPNOFREQ.cat       6.98797017  6.9885349  5.68062101  8.5891499
VARIETYxADJ_LEN.cat       15.76010524 15.7633681 13.89847136 17.4338296
ZIPF_FREQ.trans           13.41959471 13.4160578 11.86299506 15.1310351
WORD_COUNT.log             4.28080533  4.3368132  2.19988328  6.4535804
VARIETY                    1.89480225  1.9730096  1.00496698  3.1065494
SYNT_FUN.confl             2.63116152  2.7921912  0.10110122  5.0710036
STRESS_LAST_SYLL          17.21235294 17.1881314 15.31306868 19.0004399
SEG_S                      6.72334641  6.7311847  4.37666027  8.9601071
SEG_A                      5.99421647  6.0866438  2.93829494  7.7660516
SEG_DIFF                   5.34021651  5.3714519  3.05801860  7.9290785
RHY_S                      1.02660591  0.8626261 -0.22760646  2.6753267
RHY_A                      7.35328277  7.3631153  5.84394696  8.7590434
RHY_DIFF                   5.69787288  5.7327761  3.44364789  7.0657322
READABILITY.log            5.15818113  5.1819261  3.43467867  6.6844643
PERSISTENCE                2.98263567  3.0849548 -0.05460303  5.5023577
PERSIST_FORM.1000          0.56706758  0.7603471 -0.77446357  1.3490702
NEWSPAPER                  1.17849512  1.3634910 -1.05759155  2.6664365
LEXDIV.win                 6.65022033  6.6921415  3.50813903  9.4436205
FORM                       0.70913984  0.8126717 -1.45905062  2.6178364
FINAL_SEGMENT             13.24988218 13.2527389 11.85797181 14.7474660
DPNOFREQ                  10.37066890 10.3572321  8.84269509 12.3910781
COMPL.confl                0.67879077  0.6255756 -1.04513401  2.1400496
ADVMOD                     0.03530495  0.1743451 -1.29804535  0.9400113
ADJ_LEN                   23.24500180 23.1763745 21.63927155 25.2229926
                            normHits  decision
VARIETYxSYNT_FUN.confl    0.87939698 Confirmed
VARIETYxSTRESS_LAST_SYLL  1.00000000 Confirmed
VARIETYxPERSIST_FORM.1000 0.51758794 Tentative
VARIETYxNEWSPAPER         0.02512563  Rejected
VARIETYxFORM              0.45226131 Tentative
VARIETYxCOMPL.confl       0.44221106 Tentative
VARIETYxADVMOD            0.44221106 Tentative
VARIETYxZIPF_FREQ.cat     1.00000000 Confirmed
VARIETYxSEG_S.cat         0.98994975 Confirmed
VARIETYxRHY_S.cat         0.54271357 Tentative
VARIETYxRHY_A.cat         1.00000000 Confirmed
VARIETYxRHY_DIFF.cat      0.98492462 Confirmed
VARIETYxPERSISTENCE.cat   0.38190955 Tentative
VARIETYxFINAL_SEGMENT.cat 1.00000000 Confirmed
VARIETYxDPNOFREQ.cat      1.00000000 Confirmed
VARIETYxADJ_LEN.cat       1.00000000 Confirmed
ZIPF_FREQ.trans           1.00000000 Confirmed
WORD_COUNT.log            0.95477387 Confirmed
VARIETY                   0.06030151  Rejected
SYNT_FUN.confl            0.56281407 Tentative
STRESS_LAST_SYLL          1.00000000 Confirmed
SEG_S                     1.00000000 Confirmed
SEG_A                     0.98994975 Confirmed
SEG_DIFF                  0.98994975 Confirmed
RHY_S                     0.01005025  Rejected
RHY_A                     1.00000000 Confirmed
RHY_DIFF                  1.00000000 Confirmed
READABILITY.log           0.98994975 Confirmed
PERSISTENCE               0.66331658 Confirmed
PERSIST_FORM.1000         0.00000000  Rejected
NEWSPAPER                 0.01507538  Rejected
LEXDIV.win                1.00000000 Confirmed
FORM                      0.00000000  Rejected
FINAL_SEGMENT             1.00000000 Confirmed
DPNOFREQ                  1.00000000 Confirmed
COMPL.confl               0.00000000  Rejected
ADVMOD                    0.00000000  Rejected
ADJ_LEN                   1.00000000 Confirmed
# Grid search over the random-forest hyperparameters: for every combination of
# ntree (number of trees) and mtry (predictors tried per split), fit a forest
# and record its out-of-bag error rate after the final tree.
ntree.vals <- c(500, 1000, 1500, 2000, 2500, 3000)
mtry.vals <- 1:10
collector <- matrix(
  NA_real_,
  nrow=length(ntree.vals), ncol=length(mtry.vals),  # dims derived from the grids, not hard-coded
  dimnames=list(NTREE=ntree.vals, MTRY=mtry.vals))
for(k in seq_along(ntree.vals)){
  for(j in seq_along(mtry.vals)){
    # re-seed before every fit so all grid cells are comparable
    # (set.seed() returns NULL invisibly, so there is nothing to store)
    set.seed(sum(utf8ToInt("All the Young Dudes")))
    collector[k,j] <- randomForest(
      COMPARISON ~
        ADJ_LEN + DPNOFREQ + FINAL_SEGMENT + LEXDIV.win + PERSISTENCE + READABILITY.log +
        RHY_DIFF + RHY_A + SEG_DIFF + SEG_A + SEG_S + STRESS_LAST_SYLL + SYNT_FUN.confl +
        WORD_COUNT.log + ZIPF_FREQ.trans + VARIETYxADJ_LEN.cat + VARIETYxDPNOFREQ.cat +
        VARIETYxFINAL_SEGMENT.cat + VARIETYxRHY_DIFF.cat + VARIETYxRHY_A.cat + VARIETYxRHY_S.cat +
        VARIETYxSEG_S.cat + VARIETYxZIPF_FREQ.cat + VARIETYxADVMOD + VARIETYxCOMPL.confl +
        VARIETYxPERSIST_FORM.1000 + VARIETYxSTRESS_LAST_SYLL + VARIETYxSYNT_FUN.confl,
      data=x,
      ntree=ntree.vals[k], mtry=mtry.vals[j],
      importance=TRUE)$err.rate[ntree.vals[k], "OOB"]
    # cat(ntree.vals[k], mtry.vals[j], collector[k,j], "\n", sep="\t")
  }
}
# Locate the cell with the smallest OOB error. Searching the transposed matrix
# breaks ties in favor of the smallest ntree (and, within it, the smallest mtry).
best <- which(t(collector)==min(collector), arr.ind=TRUE)[1, ]
mtry <- mtry.vals[best[[1]]]   # rows of t(collector) index mtry
ntree <- ntree.vals[best[[2]]] # columns of t(collector) index ntree

The smallest OOB error rate in collector is obtained with mtry=5 and ntree=500.

5 The Model

# Final model: refit the forest with the winning hyperparameter combination
# from the grid search (ntree=500, mtry=5). Re-seeding makes the fit
# reproducible; the outer parentheses print the fitted object (OOB error
# rate and confusion matrix).
set.seed(sum(utf8ToInt("All the Young Dudes")))
(rf.1 <- randomForest(COMPARISON ~ 
                        ADJ_LEN + DPNOFREQ + FINAL_SEGMENT + LEXDIV.win + PERSISTENCE + READABILITY.log + 
                        RHY_DIFF + RHY_A + SEG_DIFF + SEG_A + SEG_S + STRESS_LAST_SYLL + SYNT_FUN.confl + 
                        WORD_COUNT.log + ZIPF_FREQ.trans + VARIETYxADJ_LEN.cat + VARIETYxDPNOFREQ.cat + 
                        VARIETYxFINAL_SEGMENT.cat + VARIETYxRHY_DIFF.cat + VARIETYxRHY_A.cat + VARIETYxRHY_S.cat + 
                        VARIETYxSEG_S.cat + VARIETYxZIPF_FREQ.cat + VARIETYxADVMOD + VARIETYxCOMPL.confl + 
                        VARIETYxPERSIST_FORM.1000 + VARIETYxSTRESS_LAST_SYLL + VARIETYxSYNT_FUN.confl, 
                      data=x,
                      ntree=500,          # number of trees (grid-search winner)
                      mtry=5,             # predictors tried per split (grid-search winner)
                      importance=TRUE))   # compute permutation variable importance

Call:
 randomForest(formula = COMPARISON ~ ADJ_LEN + DPNOFREQ + FINAL_SEGMENT +      LEXDIV.win + PERSISTENCE + READABILITY.log + RHY_DIFF + RHY_A +      SEG_DIFF + SEG_A + SEG_S + STRESS_LAST_SYLL + SYNT_FUN.confl +      WORD_COUNT.log + ZIPF_FREQ.trans + VARIETYxADJ_LEN.cat +      VARIETYxDPNOFREQ.cat + VARIETYxFINAL_SEGMENT.cat + VARIETYxRHY_DIFF.cat +      VARIETYxRHY_A.cat + VARIETYxRHY_S.cat + VARIETYxSEG_S.cat +      VARIETYxZIPF_FREQ.cat + VARIETYxADVMOD + VARIETYxCOMPL.confl +      VARIETYxPERSIST_FORM.1000 + VARIETYxSTRESS_LAST_SYLL + VARIETYxSYNT_FUN.confl,      data = x, ntree = 500, mtry = 5, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 5

        OOB estimate of  error rate: 1.79%
Confusion matrix:
          analytic synthetic class.error
analytic       113         4  0.03418803
synthetic        4       325  0.01215805
# Out-of-bag predictions from the final forest: predicted probability of
# 'synthetic' plus the predicted category for every observation.
x$PREDS.NUM.rf1 <- predict(rf.1, type="prob")[, "synthetic"]
x$PREDS.CAT.rf1 <- predict(rf.1)
# the probability with which the OBSERVED variant was chosen
x$PREDS.NUM.rf1.obs <- ifelse(x$COMPARISON=="synthetic",
                              x$PREDS.NUM.rf1,
                              1-x$PREDS.NUM.rf1)
logloss <- -mean(log(x$PREDS.NUM.rf1.obs)) # 0.084
# confusion matrix & its evaluation (accuracy, precision/recall per class)
(c.m <- table(OBS=x$COMPARISON, PREDS=x$PREDS.CAT.rf1)); c(
  "Class. acc."    =mean(x$COMPARISON==x$PREDS.CAT.rf1, na.rm=TRUE),
  "Prec. for synthetic"=c.m["synthetic","synthetic"] / sum(c.m[,"synthetic"]),
  "Rec. for synthetic" =c.m["synthetic","synthetic"] / sum(c.m["synthetic",]),
  "Prec. for analytic"=c.m["analytic","analytic"] / sum(c.m[,"analytic"]),
  "Rec. for analytic" =c.m["analytic","analytic"] / sum(c.m["analytic",]))
           PREDS
OBS         analytic synthetic
  analytic       113         4
  synthetic        4       325
        Class. acc. Prec. for synthetic  Rec. for synthetic  Prec. for analytic 
          0.9820628           0.9878419           0.9878419           0.9658120 
  Rec. for analytic 
          0.9658120 
(varimps <- rf.1$importance)[,3:4]
                          MeanDecreaseAccuracy MeanDecreaseGini
ADJ_LEN                           1.034095e-01       37.1440855
DPNOFREQ                          7.585457e-03        3.1235906
FINAL_SEGMENT                     1.276964e-02        8.4235328
LEXDIV.win                        3.715063e-03        3.0412576
PERSISTENCE                       1.380123e-03        1.8226580
READABILITY.log                   2.489495e-03        1.8775345
RHY_DIFF                          1.846570e-03        0.9541832
RHY_A                             2.602820e-03        1.3885271
SEG_DIFF                          2.796271e-03        2.9088197
SEG_A                             1.957821e-03        1.7146716
SEG_S                             3.559805e-03        2.6198574
STRESS_LAST_SYLL                  4.358566e-02       20.6168262
SYNT_FUN.confl                    5.259399e-04        0.2920680
WORD_COUNT.log                    1.652742e-03        2.0106882
ZIPF_FREQ.trans                   1.679121e-02        6.5149179
VARIETYxADJ_LEN.cat               7.738788e-02       29.1257391
VARIETYxDPNOFREQ.cat              4.855285e-03        1.9980827
VARIETYxFINAL_SEGMENT.cat         1.505212e-02       12.2140002
VARIETYxRHY_DIFF.cat              1.702025e-03        1.0668374
VARIETYxRHY_A.cat                 3.789232e-03        2.1697736
VARIETYxRHY_S.cat                 2.998041e-04        0.3283484
VARIETYxSEG_S.cat                 3.149952e-03        1.1055828
VARIETYxZIPF_FREQ.cat             1.531515e-02        9.2415441
VARIETYxADVMOD                    1.792046e-05        0.2107844
VARIETYxCOMPL.confl               2.563686e-04        0.2171749
VARIETYxPERSIST_FORM.1000         1.436808e-03        1.1977105
VARIETYxSTRESS_LAST_SYLL          3.847270e-02       18.0130404
VARIETYxSYNT_FUN.confl            1.366823e-03        0.6454828
# Variable importance (mean decrease in accuracy), sorted ascending.
dotchart(sort(varimps[,1]),
         pch=4,
         xlab='Mean Decrease in Accuracy',
         main='Variable Importance Plot')

# caret's confusion-matrix summary for the OOB predictions
confusionMatrix(x$PREDS.CAT.rf1, x$COMPARISON)
Confusion Matrix and Statistics

           Reference
Prediction  analytic synthetic
  analytic       113         4
  synthetic        4       325
                                         
               Accuracy : 0.9821         
                 95% CI : (0.965, 0.9922)
    No Information Rate : 0.7377         
    P-Value [Acc > NIR] : <2e-16         
                                         
                  Kappa : 0.9537         
                                         
 Mcnemar's Test P-Value : 1              
                                         
            Sensitivity : 0.9658         
            Specificity : 0.9878         
         Pos Pred Value : 0.9658         
         Neg Pred Value : 0.9878         
             Prevalence : 0.2623         
         Detection Rate : 0.2534         
   Detection Prevalence : 0.2623         
      Balanced Accuracy : 0.9768         
                                         
       'Positive' Class : analytic       
                                         

6 Plots

Save the variable importance plot

# Uncomment the png()/dev.off() pair to write the plot to disk.
#png("03a_VarIMPPlot.png", width=40, height=30, units="cm", res=300)
dotchart(sort(varimps[,1]), pch=4, xlab='Mean Decrease in Accuracy', main='Variable Importance Plot')

#dev.off()

6.1 Plot the most important variables

Plot for ADJ_LEN:

# Partial-dependence scores for ADJ_LEN: predicted probability of the 2nd
# response level ('synthetic') across the observed adjective lengths.
(pd.cas <- partial(
   rf.1,                # the fitted forest
   pred.var="ADJ_LEN",  # predictor of interest
   train=x,             # training data
   which.class=2,       # 2nd level of COMPARISON ('synthetic')
   prob=TRUE))          # return probabilities
   ADJ_LEN      yhat
1        3 0.8152601
2        4 0.8152601
3        5 0.8100628
4        6 0.6037713
5        7 0.5324574
6        8 0.4989013
7        9 0.4963901
8       10 0.4961839
9       11 0.4961839
10      12 0.4961839
11      13 0.4961839
# Proportion of each adjective length in the data; used to scale point size.
tab.cas <- prop.table(table(x$ADJ_LEN))

#png("03b_pd-adjlen.png", width=18, height=13, units="cm", res=300)
plot(
  x=pd.cas$ADJ_LEN,
  y=pd.cas$yhat,
  type="b", pch=16,
  main="Partial dep. of COMPARISON on ADJ_LEN",
  xlab="Adjective Length in Characters",
  ylab=substitute(paste('Probability of ', italic('synthetic'), ' comparison')),
  ylim=c(0,1),
  cex=1+tab.cas*10  # larger dots for more frequent lengths
)
#abline(h=partial(object=rf.1, pred.var = 'VARIETY', which.class=2, train=x, prob=TRUE)[1,2], lty=2)
# dashed baseline: overall proportion of synthetic comparisons
abline(h=mean(x$COMPARISON=='synthetic'), lty=2)
# grey smoother summarizing the trend
lines(lowess(pd.cas$ADJ_LEN, pd.cas$yhat), lwd=6, col="#BCBCBCB0")

#dev.off()

Plot for VARIETYxADJ_LEN.cat:

# Partial dependence of COMPARISON on the VARIETY x ADJ_LEN interaction.
(pd.corpustrigger <- partial(
   rf.1,
   pred.var='VARIETYxADJ_LEN.cat',
   train=x,
   which.class=2,
   prob=TRUE))
  VARIETYxADJ_LEN.cat      yhat
1           BrE:(0,4] 0.7914305
2           BrE:(4,5] 0.7831570
3           BrE:(5,7] 0.6180807
4          BrE:(7,13] 0.5765336
5            LK:(0,4] 0.7983812
6            LK:(4,5] 0.7775516
7            LK:(5,7] 0.6371570
8           LK:(7,13] 0.5821973
# Reshape the scores into a 4 (ADJ_LEN bins) x 2 (variety) matrix.
b <- matrix(
  pd.corpustrigger$yhat, nrow=4,
  dimnames=list(levels(x$ADJ_LEN.cat), levels(x$VARIETY)))

#png("03c_pd-varietyxadjlen.png", width=18, height=13, units="cm", res=300)
# empty canvas; category labels are added manually on the x-axis
plot(x=0, pch='', xaxt='n', bty='n', xlim=c(0,3), ylim=c(0.5,1),
     xlab='Adjective length in characters',
     ylab=substitute(paste('Probability of ', italic('synthetic'), ' comparison')),
     main='Partial dep. of COMPARISON on VARIETYxADJ_LEN.cat',
     cex.main=1.5)
grid()
axis(1, at=0:3, labels=levels(x$ADJ_LEN.cat))
#abline(h=partial(object=rf.1, pred.var='VARIETY', which.class=2, train=x, prob=TRUE)[1,2], lty=2)
# dashed baseline: overall proportion of synthetic comparisons
abline(h=mean(x$COMPARISON=='synthetic'), lty=2)
points(x=0:3, y=b[,2], pch=16, col=alpha('#004F86', alpha=0.7), cex=3)  # SLE (column 'LK')
points(x=0:3, y=b[,1], pch=16, col=alpha('#7BC8FF', alpha=0.7), cex=3)  # BrE
legend('top', legend=c('BrE', 'SLE'), fill=c('#7BC8FF', '#004F86'), ncol=2, xjust=0.5, yjust=0.5)

#dev.off()

Plot for STRESS_LAST_SYLL:

# Partial dependence of COMPARISON on final-syllable stress.
(pd.corpustrigger <- partial(
   rf.1,
   pred.var='STRESS_LAST_SYLL',
   train=x,
   which.class=2,
   prob=TRUE))
  STRESS_LAST_SYLL      yhat
1                n 0.6305247
2                y 0.7606323
# One-column matrix of the two partial-dependence scores (stress: n/y).
b <- matrix(data=pd.corpustrigger$yhat, nrow=2)
rownames(b) <- levels(x$STRESS_LAST_SYLL)


#png("03d_pd-stresslastsyll.png", width=22.5, height=15, units="cm", res=300)
plot(x=0, ylim=c(0.5,1), xlim=c(0,1), xaxt='n', bty='n', pch='', 
     xlab='Stress on the last syllable of the adjective lemma',
     # fixed: leading space before 'comparison' was missing, which rendered
     # "synthetic" and "comparison" as one word (all sibling plots have it)
     ylab=substitute(paste('Probability of ', italic('synthetic'), ' comparison')),
     main='Partial dep. of COMPARISON on STRESS_LAST_SYLL',
     cex.main=1.5);grid()
axis(1, at=0:1, labels=c('no', 'yes'))
#abline(h=partial(object=rf.1, pred.var='VARIETY', which.class=2, train=x, prob=TRUE)[1,2], lty=2)
# dashed baseline: overall proportion of synthetic comparisons
abline(h=sum(x$COMPARISON=='synthetic')/nrow(x), lty=2)
# single series: this predictor is not split by variety
points(x=0:1, y=b[,1], pch=16, col=alpha('#004F86', alpha=0.7), cex=3)

#dev.off()

Plot for VARIETYxSTRESS_LAST_SYLL:

# Partial dependence of COMPARISON on the VARIETY x STRESS_LAST_SYLL interaction.
(pd.corpustrigger <- partial(
   rf.1,
   pred.var='VARIETYxSTRESS_LAST_SYLL',
   train=x,
   which.class=2,
   prob=TRUE))
  VARIETYxSTRESS_LAST_SYLL      yhat
1                    BrE:n 0.6576906
2                    BrE:y 0.7624126
3                     LK:n 0.6456278
4                     LK:y 0.7597489
# Reshape into a 2 (stress: n/y) x 2 (variety) matrix.
b <- matrix(
  pd.corpustrigger$yhat, nrow=2,
  dimnames=list(levels(x$STRESS_LAST_SYLL), levels(x$VARIETY)))

#png("03e_pd-varietyxstresslastsyll.png", width=18, height=13, units="cm", res=300)
plot(x=0, pch='', xaxt='n', bty='n', xlim=c(0,1), ylim=c(0.5,1),
     xlab='Stress on the last syllable of the adjective lemma',
     ylab=substitute(paste('Probability of ', italic('synthetic'), ' comparison')),
     main='Partial dep. of COMPARISON on VARIETYxSTRESS_LAST_SYLL',
     cex.main=1)
grid()
axis(1, at=0:1, labels=c('no', 'yes'))
#abline(h=partial(object=rf.1, pred.var='VARIETY', which.class=2, train=x, prob=TRUE)[1,2], lty=2)
# dashed baseline: overall proportion of synthetic comparisons
abline(h=mean(x$COMPARISON=='synthetic'), lty=2)
points(x=0:1, y=b[,2], pch=16, col=alpha('#004F86', alpha=0.7), cex=3)  # SLE (column 'LK')
points(x=0:1, y=b[,1], pch=16, col=alpha('#7BC8FF', alpha=0.7), cex=3)  # BrE
legend('top', legend=c('BrE', 'SLE'), fill=c('#7BC8FF', '#004F86'), ncol=2, xjust=0.5, yjust=0.5)

#dev.off()

Plot for VARIETYxZIPF_FREQ.cat:

# Partial dependence of COMPARISON on the VARIETY x ZIPF_FREQ interaction.
(pd.corpustrigger <- partial(
   rf.1,
   pred.var='VARIETYxZIPF_FREQ.cat',
   train=x,
   which.class=2,
   prob=TRUE))
  VARIETYxZIPF_FREQ.cat      yhat
1     BrE:[-0.5,13.293] 0.7229193
2   BrE:(13.293,14.756] 0.7448251
3   BrE:(14.756,14.942] 0.7315830
4       BrE:(14.942,24] 0.7465830
5      LK:[-0.5,13.293] 0.6843632
6    LK:(13.293,14.756] 0.7469372
7    LK:(14.756,14.942] 0.7244439
8        LK:(14.942,24] 0.7456099
# Reshape into a 4 (frequency bins) x 2 (variety) matrix.
b <- matrix(
  pd.corpustrigger$yhat, nrow=4,
  dimnames=list(levels(x$ZIPF_FREQ.cat), levels(x$VARIETY)))

#png("03f_pd-varietyxfrequency.png", width=18, height=13, units="cm", res=300)
plot(x=0, pch='', xaxt='n', bty='n', xlim=c(-0.2,3.2), ylim=c(0.5,1),
     xlab='Frequency of the Adjective Lemma',
     ylab=substitute(paste('Probability of ', italic('synthetic'), ' comparison')),
     main='Partial dep. of COMPARISON on VARIETYxZIPF_FREQ.cat',
     cex.main=1)
grid()
# relabel the first bin so the axis does not show the artificial -0.5 lower bound
axis(1, at=0:3, labels=c("[0,13.293]", levels(x$ZIPF_FREQ.cat)[2:4]))
#abline(h=partial(object=rf.1, pred.var='VARIETY', which.class=2, train=x, prob=TRUE)[1,2], lty=2)
# dashed baseline: overall proportion of synthetic comparisons
abline(h=mean(x$COMPARISON=='synthetic'), lty=2)
# the two varieties are offset horizontally so the dots do not overlap
points(x=c(0.05, 1.05, 2.05, 3.05), y=b[,2], pch=16, col=alpha('#004F86', alpha=0.7), cex=3)   # SLE (column 'LK')
points(x=c(-0.05, 0.95, 1.95, 2.95), y=b[,1], pch=16, col=alpha('#7BC8FF', alpha=0.7), cex=3)  # BrE
legend('top', legend=c('BrE', 'SLE'), fill=c('#7BC8FF', '#004F86'), ncol=2, xjust=0.5, yjust=0.5)

#dev.off()

Plot for VARIETYxFINAL_SEGMENT.cat:

# Partial dependence of COMPARISON on the VARIETY x FINAL_SEGMENT interaction.
(pd.corpustrigger <- partial(
   rf.1,
   pred.var='VARIETYxFINAL_SEGMENT.cat',
   train=x,
   which.class=2,
   prob=TRUE))
   VARIETYxFINAL_SEGMENT.cat      yhat
1                      BrE:0 0.7511256
2              BrE:(0,0.333] 0.7414215
3          BrE:(0.333,0.625] 0.7367668
4          BrE:(0.625,0.667] 0.6889058
5              BrE:(0.667,1] 0.7008117
6                       LK:0 0.7480493
7               LK:(0,0.333] 0.7569148
8           LK:(0.333,0.625] 0.7343363
9           LK:(0.625,0.667] 0.7314350
10              LK:(0.667,1] 0.6906637
# Reshape into a 5 (final-segment bins) x 2 (variety) matrix.
b <- matrix(
  pd.corpustrigger$yhat, nrow=5,
  dimnames=list(levels(x$FINAL_SEGMENT.cat), levels(x$VARIETY)))

#png("03g_pd-varietyxfinalsegment.png", width=18, height=13, units="cm", res=300)
plot(x=0, pch='', xaxt='n', bty='n', xlim=c(-0.2,4.2), ylim=c(0.5,1),
     xlab='Similarity of the Final Segment to the Synthetic Ending',
     ylab=substitute(paste('Probability of ', italic('synthetic'), ' comparison')),
     main='Partial dep. of COMPARISON on VARIETYxFINAL_SEGMENT.cat',
     cex.main=1)
grid()
axis(1, at=0:4, labels=levels(x$FINAL_SEGMENT.cat))
#abline(h=partial(object=rf.1, pred.var='VARIETY', which.class=2, train=x, prob=TRUE)[1,2], lty=2)
# dashed baseline: overall proportion of synthetic comparisons
abline(h=mean(x$COMPARISON=='synthetic'), lty=2)
# the two varieties are offset horizontally so the dots do not overlap
points(x=c(0.05,1.05,2.05,3.05,4.05), y=b[,2], pch=16, col=alpha('#004F86', alpha=0.7), cex=3)   # SLE (column 'LK')
points(x=c(-0.05,0.95,1.95,2.95,3.95), y=b[,1], pch=16, col=alpha('#7BC8FF', alpha=0.7), cex=3)  # BrE
legend('top', legend=c('BrE', 'SLE'), fill=c('#7BC8FF', '#004F86'), ncol=2, xjust=0.5, yjust=0.5)

#dev.off()
sessionInfo()
R version 4.3.0 (2023-04-21 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 11 x64 (build 26100)

Matrix products: default


locale:
[1] LC_COLLATE=German_Germany.utf8  LC_CTYPE=German_Germany.utf8   
[3] LC_MONETARY=German_Germany.utf8 LC_NUMERIC=C                   
[5] LC_TIME=German_Germany.utf8    

time zone: Europe/Berlin
tzcode source: internal

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] car_3.1-2            carData_3.0-5        tree_1.0-43         
 [4] randomForest_4.7-1.1 pdp_0.8.1            partykit_1.2-20     
 [7] mvtnorm_1.2-2        libcoin_1.0-9        MASS_7.3-58.4       
[10] dplyr_1.1.2          caret_6.0-94         lattice_0.21-8      
[13] ggplot2_3.5.0        Boruta_8.0.0        

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.0     timeDate_4022.108    farver_2.1.1        
 [4] fastmap_1.1.1        pROC_1.18.4          digest_0.6.31       
 [7] rpart_4.1.19         timechange_0.2.0     lifecycle_1.0.3     
[10] survival_3.5-5       magrittr_2.0.3       compiler_4.3.0      
[13] rlang_1.1.1          tools_4.3.0          utf8_1.2.3          
[16] yaml_2.3.7           data.table_1.14.8    knitr_1.50          
[19] htmlwidgets_1.6.2    plyr_1.8.8           abind_1.4-5         
[22] withr_2.5.0          purrr_1.0.1          nnet_7.3-18         
[25] stats4_4.3.0         fansi_1.0.4          e1071_1.7-16        
[28] colorspace_2.1-0     future_1.33.0        globals_0.16.2      
[31] scales_1.3.0         iterators_1.0.14     cli_3.6.1           
[34] inum_1.0-5           rmarkdown_2.22       generics_0.1.3      
[37] rstudioapi_0.14      future.apply_1.11.0  reshape2_1.4.4      
[40] proxy_0.4-27         stringr_1.5.0        splines_4.3.0       
[43] parallel_4.3.0       vctrs_0.6.3          hardhat_1.3.0       
[46] Matrix_1.5-4         jsonlite_1.8.5       Formula_1.2-5       
[49] listenv_0.9.0        foreach_1.5.2        gower_1.0.1         
[52] recipes_1.0.6        glue_1.6.2           parallelly_1.36.0   
[55] codetools_0.2-19     lubridate_1.9.2      stringi_1.7.12      
[58] gtable_0.3.3         munsell_0.5.0        tibble_3.2.1        
[61] pillar_1.9.0         htmltools_0.5.5      ipred_0.9-14        
[64] lava_1.7.2.1         R6_2.5.1             evaluate_0.21       
[67] class_7.3-21         Rcpp_1.0.10          nlme_3.1-162        
[70] prodlim_2023.03.31   ranger_0.16.0        xfun_0.51           
[73] ModelMetrics_1.2.2.2 pkgconfig_2.0.3     

References

Mondorf, B. (2014). Apparently competing motivations in morpho-syntactic variation. In E. A. Moravcsik, A. Malchukov, & B. MacWhinney (Eds.), Competing motivations in grammar and usage (pp. 209–228). Oxford University Press.